import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from joblib import dump, load
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Load the raw delivery data and engineer date-time / price features.
data = pd.read_csv('historical_data.csv')
data.shape
data.describe()
# Change data type to date and create a new col: delivery_time_secs which would be the model target
# Convert UTC to PST just to understand the data more in local time
data['created_at'] = pd.to_datetime(data['created_at']).dt.tz_localize('utc').dt.tz_convert('US/Pacific')
data['actual_delivery_time'] = pd.to_datetime(data['actual_delivery_time']).dt.tz_localize('utc').dt.tz_convert('US/Pacific')
# Target: total delivery duration in seconds.
data['delivery_time_secs'] = (data['actual_delivery_time'] - data['created_at']).dt.total_seconds()
# Create additional date-time features based on created_at
data['hour'] = data['created_at'].dt.hour
data['day'] = data['created_at'].dt.dayofweek
# Different Buckets: Early morn, breakfast, lunch, snack, dinner etc
# 0-6 means: starting from 00:00 to 05:59
data['hour_0_6'] = data['hour'].map(lambda x: 1 if 0 <= x < 6 else 0)
data['hour_6_11'] = data['hour'].map(lambda x: 1 if 6 <= x < 11 else 0)
data['hour_11_15'] = data['hour'].map(lambda x: 1 if 11 <= x < 15 else 0)
data['hour_15_18'] = data['hour'].map(lambda x: 1 if 15 <= x < 18 else 0)
# BUGFIX: upper bound was exclusive (x < 23), so hour == 23 fell into no
# bucket at all; the late-night bucket now covers 18:00-23:59.
data['hour_18_23'] = data['hour'].map(lambda x: 1 if 18 <= x <= 23 else 0)
# Price-per-item features. NOTE(review): assumes total_items and
# num_distinct_items are never 0 — a 0 would produce inf here; verify in data.
data['avg_price_item'] = data['subtotal'] / data['total_items']
data['avg_price_distinct_item'] = data['subtotal'] / data['num_distinct_items']
data.head()
data.dtypes
data['delivery_time_secs'].describe()
data['delivery_time_secs'].quantile([ i*1.0/100 for i in range(0,100,5)])
data[data['delivery_time_secs'] > 3600*12].sort_values(by=['delivery_time_secs'],ascending=False)
data.loc[data['delivery_time_secs'] <= 4872, ['delivery_time_secs','hour']].groupby('hour').mean().plot(kind='bar')
# Outliers in terms of delivery time secs
print(data.shape)
for i in range(2,10+1):
    print('More than', i, 'hours', data.loc[data['delivery_time_secs'] > 3600*i].shape)
# plot the heatmap of corr coeffs to understand how each predictor is corr with target and with each other
# BUGFIX: DataFrame.corr() raises on non-numeric columns in pandas >= 2.0
# (the frame contains tz-aware datetimes and a category column), so restrict
# to numeric dtypes first — same result older pandas produced by auto-dropping.
corr = data.select_dtypes(include=[np.number]).corr()
fig, ax = plt.subplots(figsize=(20, 20))  # sample figsize in inches
sns.heatmap(corr,
            xticklabels=corr.columns,
            yticklabels=corr.columns,
            annot=True,
            ax=ax)
# Columns of interest for the pairwise EDA plot.
cols = [
    'market_id',
    'store_id',
    'day',
    'hour',
    'estimated_store_to_consumer_driving_duration',
    'total_onshift_dashers',
    'total_busy_dashers',
    'total_outstanding_orders',
    'subtotal',
    'delivery_time_secs',
]
sns.pairplot(data[cols])
# Regression plots of the target against a handful of predictors, laid out 2x3.
fig = plt.figure(figsize=(10, 5), dpi=80)
predictors = ['hour', 'day', 'total_onshift_dashers',
              'market_id', 'store_id', 'total_outstanding_orders']
for plot_idx, predictor in enumerate(predictors, start=1):
    ax = fig.add_subplot(2, 3, plot_idx)
    sns.regplot(x=predictor, y="delivery_time_secs", data=data, ax=ax)
# Overview of missing values per column.
data.isna().sum()
#These records can be dropped?
print(data.shape)
# Rows without a target value are useless for supervised training — drop them.
data = data[data['delivery_time_secs'].notna()]
print(data.shape)
data[data['market_id'].isna()].head()
data['market_id'].value_counts()
# Create a market: unknown (id=99)
data['market_id'] = data['market_id'].fillna(99)
data['market_id'].value_counts()
data[data['store_primary_category'].isna()].head()
# Set to "other" wherever NA, then encode the category as integer codes.
data['store_primary_category'] = data['store_primary_category'].fillna('other').astype('category')
data['store_primary_category_cat'] = data['store_primary_category'].cat.codes
data[data['order_protocol'].isna()].head()
# Create a order_protocol: unknown (id=99)
data['order_protocol'] = data['order_protocol'].fillna(99)
data['order_protocol'].value_counts()
# Peek at rows where the dasher counts are missing.
data.loc[data['total_onshift_dashers'].isna()].head()
"""
Create 3 maps:
key: ('market_id','store_id','day','hour')
value: average of total_onshift_dashers, total_busy_dashers or total_outstanding_orders
"""
_group_keys = ['market_id', 'store_id', 'day', 'hour']
_dasher_cols = ['total_onshift_dashers', 'total_busy_dashers', 'total_outstanding_orders']
dashers_map = data[_group_keys + _dasher_cols].groupby(_group_keys).mean().to_dict()
print(dashers_map.keys())
total_onshift_dashers_map = dashers_map['total_onshift_dashers']
total_busy_dashers_map = dashers_map['total_busy_dashers']
total_outstanding_orders_map = dashers_map['total_outstanding_orders']
# Sanity check: print a single (key, value) pair from each map.
for sample_map in (total_onshift_dashers_map,
                   total_busy_dashers_map,
                   total_outstanding_orders_map):
    for k, v in sample_map.items():
        print(k, v)
        break
# Impute a missing total_onshift_dashers value from the mean of its
# (market_id, store_id, day, hour) group computed above.
# (The original comment mentioned "driving duration" — a copy-paste slip.)
# NOTE(review): currently dead code — the .apply() that would call this is
# commented out below; missing dasher counts are filled with 0 instead.
def total_onshift_dashers_missing(row):
    # row: Series holding at least market_id, store_id, day, hour,
    # total_onshift_dashers.
    ret = row['total_onshift_dashers']
    if np.isnan(ret):
        # The group key exists because the map was built from this same frame,
        # but direct indexing would raise KeyError for a combination that
        # somehow isn't present — TODO confirm before enabling this path.
        ret = total_onshift_dashers_map[(row['market_id'],row['store_id'],row['day'],row['hour'])]
    return ret
# How many rows are missing total_onshift_dashers?
print(data[data['total_onshift_dashers'].isna()].shape)
"""
data['total_onshift_dashers'] = \
data[['market_id','store_id','day','hour','total_onshift_dashers']] \
.apply(total_onshift_dashers_missing, axis=1)
"""
# Inspect the rows with missing dasher counts before filling.
df_missing_dashers = data.loc[data['total_onshift_dashers'].isna()]
df_missing_dashers['hour'].value_counts().sort_index().plot(kind='bar')
df_missing_dashers['delivery_time_secs'].quantile([pct / 100 for pct in range(0, 100, 5)])
# Fill these cols with 0 (for now)
_fill_cols = ['total_onshift_dashers', 'total_busy_dashers', 'total_outstanding_orders']
data[_fill_cols] = data[_fill_cols].fillna(0)
# Missing value at this point (Base line Data set)
# Only estimated_store_to_consumer_driving_duration has missing values (to be taken care at modeling stage)
data.isna().sum()
data.dtypes
def train_test_split_df(dataframe):
    """Random 75/25 train/test split of *dataframe* as pandas objects.

    Drops the timestamp columns, the raw category column and the target
    from the predictors.  Returns
    (X_train, X_test, y_train, y_test, feature_columns).
    """
    # Work on a copy so the caller's frame is untouched.
    data_processed = dataframe.copy(deep=True)
    y = data_processed['delivery_time_secs']
    drop_cols = ['created_at','actual_delivery_time','delivery_time_secs','store_primary_category']
    X = data_processed.drop(drop_cols, axis=1)
    X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(X, y, test_size=0.25, random_state=43)
    print(X_train_df.shape, y_train_df.shape)
    print(X_test_df.shape, y_test_df.shape)
    # BUGFIX: previously returned data_processed.columns, which still contained
    # the dropped columns and the target; return the actual feature columns,
    # consistent with train_test_split_np below.
    return X_train_df, X_test_df, y_train_df, y_test_df, X.columns
def train_test_split_np(dataframe):
    """Random 75/25 train/test split of *dataframe* as numpy arrays.

    Predictors exclude the timestamp columns, the raw category column and
    the target.  Returns (X_train, X_test, y_train, y_test, feature_columns).
    """
    # Work on a copy so the caller's frame is untouched.
    frame = dataframe.copy(deep=True)
    target = frame['delivery_time_secs'].values
    frame.drop(['created_at', 'actual_delivery_time', 'delivery_time_secs', 'store_primary_category'],
               axis=1, inplace=True)
    predictors = frame.values
    X_train, X_test, y_train, y_test = train_test_split(
        predictors, target, test_size=0.25, random_state=43)
    print(X_train.shape, y_train.shape)
    print(X_test.shape, y_test.shape)
    return X_train, X_test, y_train, y_test, frame.columns
def train_X_y(dataframe):
    """Return (X, y, feature_columns) for the whole *dataframe*, no split.

    Used for the time-based split where train/test frames are built upstream.
    """
    # Work on a copy so the caller's frame is untouched.
    frame = dataframe.copy(deep=True)
    # Target: delivery duration in seconds.
    y = frame['delivery_time_secs'].values
    # Remove timestamps, the raw category column, and the target itself.
    frame = frame.drop(
        ['created_at', 'actual_delivery_time', 'delivery_time_secs', 'store_primary_category'],
        axis=1)
    X = frame.values
    print(X.shape, y.shape)
    return X, y, frame.columns
def train_model(X_train, y_train):
    """Grid-search a GradientBoostingRegressor on the log-transformed target.

    Returns the best estimator; note its predictions are in log-seconds.
    """
    # Log Transform the target (delivery times are right-skewed).
    y_train = np.log(y_train)
    # Single-point grid: GridSearchCV is used here mainly for 5-fold CV scoring.
    param_grid = {
        'n_estimators': [200],
        'min_samples_split': [500],
        'min_samples_leaf': [100],
        'max_depth': [7]
    }
    gbdt_reg = GradientBoostingRegressor(verbose=10)
    print(gbdt_reg)
    grid_search = GridSearchCV(gbdt_reg, param_grid=param_grid, cv=5,
                               scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print('# Best Params', grid_search.best_params_)
    print('# Best Score', grid_search.best_score_)
    # BUGFIX (label): the quantity below is exp(RMSE of the log target) — a
    # multiplicative error factor — not an MSE and not seconds; the old print
    # called it "Mean Squared Error ... secs".
    print('# CV error factor, exp(RMSE of log target)', np.exp(np.sqrt(-grid_search.best_score_)))
    return grid_search.best_estimator_
def predict(estimator, X_test, y_test):
    """Score *estimator* (trained on log targets) on the held-out set.

    Prints MSE/MAE on the log scale plus exponentiated error summaries.
    """
    # Model Prediction on Test Set — predictions are in log-seconds.
    y_hat = estimator.predict(X_test)
    # Compare against the log of the true delivery times.
    log_y_test = np.log(y_test)
    mse = mean_squared_error(log_y_test, y_hat)
    mae = mean_absolute_error(log_y_test, y_hat)
    print('MSE', mse)
    print('MAE', mae)
    print('# Prediction Error (RMSE)', np.exp(np.sqrt(mse)), 'secs')
    print('# Prediction Error (MAE)', np.exp(mae), 'secs')
def train_model_orig(dataframe):
    """Original end-to-end trainer: prep, random split, grid-search a GBDT,
    print CV and test errors, return (best_estimator, feature_columns).

    NOTE(review): duplicates the train_test_split_np / train_model / predict
    pipeline above; apparently kept for reference.
    """
    # Train on copied dataframe
    data_processed = dataframe.copy(deep=True)
    # Log Transform the target
    y = np.log(data_processed['delivery_time_secs'].values)
    print(y.shape)
    # Drop timestamps, the target and the raw category column from predictors.
    drop_cols = ['created_at','actual_delivery_time','delivery_time_secs','store_primary_category']
    data_processed.drop(drop_cols, axis=1, inplace=True)
    # Predictors
    X = data_processed.values
    X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=43)
    print(X_train.shape, y_train.shape)
    print(X_test.shape, y_test.shape)
    # Single-point grid: GridSearchCV used chiefly for its 5-fold CV scoring.
    param_grid = {
        'n_estimators': [200],
        'min_samples_split': [500],
        'min_samples_leaf': [100],
        'max_depth': [7]
    }
    gbdt_reg = GradientBoostingRegressor(verbose=10)
    print(gbdt_reg)
    grid_search = GridSearchCV(gbdt_reg, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print('# Best Params', grid_search.best_params_)
    print('# Best Score', grid_search.best_score_)
    # Model Score
    # NOTE(review): exp(RMSE of the log target) is a multiplicative error
    # factor, not literally seconds — the labels below overstate this.
    print('# Mean Squared Error (Best Score)', np.exp(np.sqrt(-grid_search.best_score_)), 'secs')
    print('# Mean Squared Error (Best Score)', np.exp(np.sqrt(-grid_search.best_score_))/60, 'mins')
    print('# Mean Squared Error (Best Score)', np.exp(np.sqrt(-grid_search.best_score_))/3600, 'hrs')
    #Model Prediction on Test Set
    y_hat = grid_search.best_estimator_.predict(X_test)
    mse = mean_squared_error(y_test, y_hat)
    print('# Prediction Error', np.exp(np.sqrt(mse)), 'secs')
    print('# Prediction Error', np.exp(np.sqrt(mse))/60, 'mins')
    print('# Prediction Error', np.exp(np.sqrt(mse))/3600, 'hrs')
    return grid_search.best_estimator_, data_processed.columns
def plot_feature_importance(estimator, features):
    """Horizontal bar chart of the estimator's feature importances.

    Importances are rescaled so the largest equals 100 and bars are sorted
    ascending.  *features* must align positionally with the training columns.
    """
    feature_importances__reg = estimator.feature_importances_
    # BUGFIX: removed an unused `indices = np.argsort(...)[::-1]` computation.
    # Plot feature importance
    fig = plt.figure(figsize=(20, 10), dpi=80)
    # make importances relative to max importance
    feature_importances__reg = 100.0 * (feature_importances__reg / feature_importances__reg.max())
    sorted_idx = np.argsort(feature_importances__reg)
    pos = np.arange(sorted_idx.shape[0]) + .5
    plt.subplot(1, 2, 1)
    plt.barh(pos, feature_importances__reg[sorted_idx], align='center')
    plt.yticks(pos, np.array(features)[sorted_idx])
    plt.xlabel('Relative Importance')
    plt.title('Feature Importance')
# Baseline (v1) model: fill the remaining missing driving durations with 0,
# do a random split, train, inspect importances, score on the test set.
data_processed_1 = data.copy(deep=True)
# Handle the one column still carrying missing values.
data_processed_1['estimated_store_to_consumer_driving_duration'] = \
    data_processed_1['estimated_store_to_consumer_driving_duration'].fillna(0)
X_train, X_test, y_train, y_test, features = train_test_split_np(data_processed_1)
best_estimator_ = train_model(X_train, y_train)
plot_feature_importance(best_estimator_, features)
predict(best_estimator_, X_test, y_test)
# Save Model (file name spelling kept as-is to match existing artifacts).
dump(best_estimator_, 'gbdt_delovery_time_v1.joblib')
# Create train test split based on date
# Create aggregate features based on past data and transform train/test
data_processed_2 = data.copy(deep=True)
data_processed_2 = data_processed_2.sort_values(by = ['created_at'])
# Take the first 90% (by time) as the training set.
# BUGFIX (comment): the old comment said 80% but the code always used 0.90.
cnt = data_processed_2.shape[0]
cnt_train = int(cnt * 0.90)
print(cnt, cnt_train)
# BUGFIX: take explicit copies — the slices were views of data_processed_2,
# and the aggregate-feature assignments below would hit pandas'
# SettingWithCopyWarning (and risk not writing through).
data_train = data_processed_2[:cnt_train].copy()
data_test = data_processed_2[cnt_train:].copy()
print(data_train.shape, data_test.shape)
print(data_train['created_at'].head(5))
print(data_train['created_at'].tail(5))
print(data_test['created_at'].head(5))
print(data_test['created_at'].tail(5))
# Create aggregate features from historical (training) data: mean
# delivery_time_secs keyed at several market/store x time granularities.
def _mean_delivery_map(key_cols):
    # Mean delivery_time_secs of data_train grouped by key_cols, as a plain dict.
    grouped = data_train[key_cols + ['delivery_time_secs']].groupby(key_cols).mean()
    return grouped.to_dict()['delivery_time_secs']

# Average delivery time secs for market/store
market_map = _mean_delivery_map(['market_id'])
store_map = _mean_delivery_map(['store_id'])
# Average delivery time secs per Day of Week for market/store
market_dow_map = _mean_delivery_map(['market_id', 'day'])
store_dow_map = _mean_delivery_map(['store_id', 'day'])
# Average delivery time sec per Hour of day for market/store
market_hour_map = _mean_delivery_map(['market_id', 'hour'])
store_hour_map = _mean_delivery_map(['store_id', 'hour'])
# Average delivery time secs per Day of Week, Hour of Day for market/store
market_dow_hour_map = _mean_delivery_map(['market_id', 'day', 'hour'])
store_dow_hour_map = _mean_delivery_map(['store_id', 'day', 'hour'])
"""
Average estimated_store_to_consumer_driving_duration for every store
If data for a store isn't available, roll up to market level
***Better Approach: estimated_store_to_consumer_driving_duration per store (or market) per day of week per hr***
"""
market_estimated_store_to_consumer_driving_duration_map = \
data_train[['market_id','estimated_store_to_consumer_driving_duration']] \
.groupby('market_id').mean().to_dict()
market_estimated_store_to_consumer_driving_duration_map = \
market_estimated_store_to_consumer_driving_duration_map['estimated_store_to_consumer_driving_duration']
print(len(market_estimated_store_to_consumer_driving_duration_map))
# Average estimated_store_to_consumer_driving_duration for every store (if store data isn't available, roll up in market level)
store_estimated_store_to_consumer_driving_duration_map = \
data_train[['store_id','estimated_store_to_consumer_driving_duration']] \
.groupby('store_id').mean().to_dict()
store_estimated_store_to_consumer_driving_duration_map = \
store_estimated_store_to_consumer_driving_duration_map['estimated_store_to_consumer_driving_duration']
print(len(store_estimated_store_to_consumer_driving_duration_map))
# Look up a row's historical average delivery times at each granularity
# (market/store x overall/day-of-week/hour/day+hour); 0 when the key is
# unseen in the training maps.  (Name's "fatures" typo kept: callers use it.)
def add_aggregate_fatures(row):
    market_key = row['market_id']
    store_key = row['store_id']
    dow = row['day']
    hr = row['hour']
    # (map, key) pairs in the exact order of the `cols` list below.
    lookups = (
        (market_map, market_key),
        (store_map, store_key),
        (market_dow_map, (market_key, dow)),
        (store_dow_map, (store_key, dow)),
        (market_hour_map, (market_key, hr)),
        (store_hour_map, (store_key, hr)),
        (market_dow_hour_map, (market_key, dow, hr)),
        (store_dow_hour_map, (store_key, dow, hr)),
    )
    return pd.Series(tuple(mapping.get(key, 0) for mapping, key in lookups))

# Column names for the aggregate features, in the same order as the Series above.
cols = ['avg_delivery_time_market', 'avg_delivery_time_store',
        'avg_delivery_time_market_dow', 'avg_delivery_time_store_dow',
        'avg_delivery_time_market_hour', 'avg_delivery_time_store_hour',
        'avg_delivery_time_market_dow_hour', 'avg_delivery_time_store_dow_hour']
# Transform train/test: attach the aggregate features to both frames.
for _frame in (data_train, data_test):
    _frame[cols] = _frame[['market_id', 'store_id', 'day', 'hour', 'delivery_time_secs']] \
        .apply(add_aggregate_fatures, axis=1)
# Impute a missing estimated_store_to_consumer_driving_duration from the
# store-level training average, rolling up to the market-level average.
def estimated_store_to_consumer_driving_duration_missing(row):
    ret = row['estimated_store_to_consumer_driving_duration']
    if np.isnan(ret):
        # BUGFIX: use .get() with a NaN default — the maps are built from
        # data_train only, so a data_test row can carry a store_id (or, on the
        # fallback, a market_id) absent from the map; direct indexing raised
        # KeyError for those rows.
        ret = store_estimated_store_to_consumer_driving_duration_map.get(row['store_id'], np.nan)
    if np.isnan(ret):
        ret = market_estimated_store_to_consumer_driving_duration_map.get(row['market_id'], np.nan)
    return ret
# Apply the driving-duration imputation to both splits, then sanity-check
# shapes and count rows that fell back to 0 in each aggregate column.
for _frame in (data_train, data_test):
    _frame['estimated_store_to_consumer_driving_duration'] = \
        _frame[['market_id', 'store_id', 'estimated_store_to_consumer_driving_duration']] \
        .apply(estimated_store_to_consumer_driving_duration_missing, axis=1)
print(data_train.shape, data_test.shape)
for col in cols:
    print(col, data_train[data_train[col] == 0].shape, data_test[data_test[col] == 0].shape)
# Train the v2 model on the time-based split with aggregate features.
X_train, y_train, features = train_X_y(data_train)
X_test, y_test, features = train_X_y(data_test)
best_estimator_ = train_model(X_train, y_train)
plot_feature_importance(best_estimator_, features)
predict(best_estimator_, X_test, y_test)
# Save Model
dump(best_estimator_, 'gbdt_delovery_time_v2.joblib')
# Back-transform predictions from log-seconds to seconds for inspection.
y_hat = pd.Series(np.exp(best_estimator_.predict(X_test)))
y_hat.describe()
# Scatter of actual vs predicted, both shown on the log scale.
fig = plt.figure(figsize=(10, 10), dpi=80)
pred_df = pd.DataFrame({'y_test': np.log(y_test), 'y_hat': np.log(y_hat)})
sns.scatterplot(x="y_test", y="y_hat", data=pred_df)